import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import timeit
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
import random
from fancyimpute import KNN,mice,MICE
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error,mean_absolute_error, accuracy_score
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from xgboost import XGBRegressor,XGBClassifier
from xgboost import plot_importance
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import metrics
from keras import backend as K
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.cluster import KMeans
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets.samples_generator import make_blobs
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.base import TransformerMixin, BaseEstimator
class CustomImputer(BaseEstimator, TransformerMixin):
    """Sklearn-style imputer supporting mean/median/mode or constant filling.

    Parameters
    ----------
    strategy : 'mean' | 'median' | 'mode' | 'fill'
        Statistic learned per column during ``fit``; ``'fill'`` uses ``filler``.
    filler : scalar or list
        Constant(s) used when ``strategy == 'fill'``.  A list is zipped onto
        the DataFrame columns to give one filler per column.
    """

    def __init__(self, strategy='mean', filler='NA'):
        self.strategy = strategy
        self.fill = filler

    def fit(self, X, y=None):
        # Learn the fill value(s) from X; for 'fill' the user-supplied
        # constant(s) are used as-is (a list becomes a column -> value map).
        if self.strategy == 'mean':
            self.fill = X.mean()
        elif self.strategy == 'median':
            self.fill = X.median()
        elif self.strategy == 'mode':
            # mode() may return several rows; keep the first.
            self.fill = X.mode().iloc[0]
        elif self.strategy == 'fill':
            if type(self.fill) is list and type(X) is pd.DataFrame:
                self.fill = {name: value for name, value in zip(X.columns, self.fill)}
        return self

    def transform(self, X, y=None):
        # Replace missing entries with the learned / supplied fill values.
        return X.fillna(self.fill)
# Load the whole MEPS dataset (pre-filtered to columns with <= 0.8 missingness).
df = pd.read_csv("MEPS_all_feature_filter0.8.csv")
df = df.select_dtypes(exclude=['object'])  # keep numeric columns only
df = df.iloc[:,5:]  # drop the first 5 columns (presumably identifiers -- TODO confirm)
df.shape
# disease/diagnostic variables (a contiguous column span in the MEPS layout)
df_health = df.loc[:,"RTHLTH31":"DSFLNV53"]
print(df_health.shape)
# pre-selected features, for example BMI, poverty, education, insurance etc.
df_exist_f = pd.read_csv("MEPS_select_add_feature_filter0.8_nototexp15.csv")
df_exist_f.shape
# combine health and preselected features:
# this is the feature matrix used throughout the rest of the script
df_c = pd.concat([df_exist_f, df_health],axis =1)
df_c = df_c.loc[:,~df_c.columns.duplicated()]### Feature matrix (dedup columns)
df_c.shape
# Rough categorical/numerical split: treat columns with at most 6 distinct
# values (NaN counts as a value) as categorical.  This is a heuristic, not a
# precise typing against the MEPS codebook.
unique_counts = {col: len(df[str(col)].unique()) for col in df.columns}
cat_columns = [col for col, count in unique_counts.items() if count <= 6]
num_columns = [col for col, count in unique_counts.items() if count > 6]
# IMPUTE missing values (first pass).
tot_list = list(df_c)
cat = [x for x in tot_list if x not in num_columns]
# NOTE(review): this pass filters columns out of `df`, while the second pass
# below filters out of `df_c` -- confirm which frame is intended here.
df_num_ord = df.filter(items = num_columns)
df_cat = df.filter(items = cat)
#################
# categorical: treat missing answers as an explicit 0 category
print("imputing variables...")
df_cat = df_cat.fillna(0)
#################
# numerical: KNN imputation (fancyimpute), preserving column names and index
df_num_ord=pd.DataFrame(data=KNN(k=3).complete(df_num_ord), columns=df_num_ord.columns, index=df_num_ord.index)
#df_num_ord = df_num
#### one-hot encode categoricals, then concatenate with the numeric block
print("encoding categorical variables...")
enc = OneHotEncoder()
enc.fit(df_cat)
encoded=pd.DataFrame(enc.transform(df_cat).toarray())
df_preprocessed = pd.concat([encoded,df_num_ord],axis =1)
# df_preprocessed: cat -> one-hot encoded, num -> KNN-imputed
df_preprocessed.shape
# Second preprocessing pass, this time over the combined feature matrix df_c.
tot_list = list(df_c)
cat = [x for x in tot_list if x not in num_columns]
df_num_ord = df_c.filter(items = num_columns)
df_cat = df_c.filter(items = cat)
#################
# categorical: missing -> explicit 0 category
print("imputing variables...")
df_cat = df_cat.fillna(0)
#################
# numerical: KNN imputation (fancyimpute)
df_num_ord=pd.DataFrame(data=KNN(k=3).complete(df_num_ord), columns=df_num_ord.columns, index = df_num_ord.index)
#df_num_ord = df_num
#### one-hot encode categoricals / concatenate to numeric block
print("encoding categorical variables...")
enc = OneHotEncoder()
enc.fit(df_cat)
# for mice imputation:
# note that MULTI-REGRESSION IMPUTATION METHODS generally require a
# missing-at-random assumption, which is likely not the case here;
# unsure whether this adds more noise -- will check via model performance
encoded=pd.DataFrame(enc.transform(df_cat).toarray())
df_preprocessed = pd.concat([encoded,df_num_ord],axis =1)
# 105
# Diagnosis flags: cancer / coronary heart disease / stroke / other heart
# disease / high blood pressure / emphysema / diabetes / arthritis / asthma /
# ADHD-ADD / pregnancy / independent-living screeners.
disease_f = ["CANCERDX", "CHDDX","STRKDX","OHRTDX","HIBPDX","EMPHDX","DIABDX",
             "ARTHDX", "ASTHDX", "ADHDADDX","PREGNT31","IADLHP31","ADLHLP31","AIDHLP31"]
# count plot for each disease
for i, x in enumerate(disease_f):
    plt.figure(i)
    print(x)
    # fillna(3) shows missing answers as their own bar
    # (presumably 3 is an unused code -- TODO confirm against the MEPS codebook)
    sns.countplot(df[x].fillna(3))
    plt.show()
# plot expenditure across each disease
log_totexp = np.log(df.TOTEXP15+1)  # log1p-style transform of total expenditure
df_explog = pd.concat([df.drop(["TOTEXP15"], axis=1), log_totexp], axis =1)
for i, cols in enumerate(disease_f):
    plt.figure(i)
    # violin of log-expenditure per diagnosis level, positive spenders only
    sns.violinplot(x=cols, y="TOTEXP15", data=df_explog[df_explog.TOTEXP15 >0] .fillna(3)).set_title(str(cols))
    plt.show() # differs from the 6.11 plot: shown across disease levels
#######
# Mean expenditures grouped by diagnosis flag: office-based (OBVEXP15),
# outpatient (OPVEXP15), emergency room (ERTEXP15) and total (TOTEXP15).
def _show_exp_means(flag):
    # Print the four expenditure means grouped by one diagnosis flag.
    stats = df.groupby([flag]).mean()
    for col in ("OBVEXP15", "OPVEXP15", "ERTEXP15", "TOTEXP15"):
        print(stats[col])
    return stats

df_mean = _show_exp_means('CANCERDX')   # cancer
df_mean = _show_exp_means('PREGNT31')   # pregnancy
df_mean = _show_exp_means('STRKDX')     # stroke
#
print("-----------------")
#
# coronary heart disease: median office-based expenditure only
df_mean = df.groupby(['CHDDX']).median()
print(df_mean.OBVEXP15)
df_mean = _show_exp_means('DIABDX')     # diabetes
# correlation of the (renamed) numeric features with total expenditure
df_exp = df.loc[:,"TOTEXP15":"RXEXP15"]
df_check_cor = pd.concat([df_c, df_exp.TOTEXP15],axis =1)
df_check_cor = df_check_cor.loc[:,~df_check_cor.columns.duplicated()]
df_check_cor = df_check_cor.filter(items = num_columns)
# Rename MEPS codes to readable labels for the plot/report.
df_check_cor.rename(columns={"TOTEXP15": "Total_Expenditure", "RXTOT15": "Total Office_Based Visits",
                             "OBTOTV15": "Total Out_patients Visits", "TRBLE42": "Total Prescribed Medcine",
                             "HAVFUN42": "Age", "HOMEBH42": "Attitude towards Insurance",
                             "ADAPPT42": "Family Income Index", "ERTOT15": "TOTAL Emergency Room Visits",
                             "OPTOTV15": "BMI", "KIDPRO42": "Perceived Health Status",
                             "OBOTHV15": "Working Hours", "ASTHAGED": "Cancer Diagnosis",
                             "AGELAST": "Office Based Non-Physician Visits", "AGE53X": "Highest Education"}, inplace=True)
corr = df_check_cor.corr()
corr
# features most correlated with total expenditure
corr.Total_Expenditure.sort_values(ascending=False)
# plot corr matrix
# FIX: previously `corr = corr.corr()` re-correlated the correlation matrix
# with itself before plotting; plot the feature correlation matrix directly.
mask = np.zeros_like(corr, dtype=bool)  # FIX: np.bool is removed in modern NumPy
mask[np.triu_indices_from(mask)] = 1  # hide the redundant upper triangle
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(8, 6))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=0.8,vmin=-0.2, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .6})
# Baseline XGBoost regression on the raw (fillna-0) feature matrix.
df_base = pd.concat([df_c, df["TOTEXP15"]],axis =1)
df_base = df_base.loc[:,~df_base.columns.duplicated()]
# FIX: `df_base.loc[(df_base["TOTEXP15"])]` used the expenditure *values* as
# row labels; the intent (matching DvsE_out_plot below) is to keep only rows
# with positive total expenditure.
df_base = df_base.loc[df_base["TOTEXP15"] > 0]
#df_xgb_base = df_base
df_xgb_base = df_base.fillna(0)
df_xgb_base = df_xgb_base.replace([np.inf, -np.inf], 0)
x = df_xgb_base.drop(['TOTEXP15'], axis=1)
y = np.log(df_xgb_base["TOTEXP15"]+1)  # log-transform the skewed target
#x = preprocessing.scale(x)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state = 6)
# model fit: small grid search over depth / tree count / learning rate
xgb_model = XGBRegressor()
xgb_m = GridSearchCV(xgb_model,
                     {'max_depth': [3,4,5],
                      'n_estimators': [50,75,100],
                      "learning_rate": [0.1,0.2,0.3]
                      }, verbose=1, n_jobs=-1, cv=3)
xgb_m = xgb_m.fit(X=X_train,y=y_train)
train_mae = mean_absolute_error(y_train, xgb_m.predict(X_train))
test_mae = mean_absolute_error(y_test, xgb_m.predict(X_test))
# plot 1: predicted vs true on the held-out split
pred = xgb_m.predict(X_test)
true = y_test
f, ax = plt.subplots(figsize=(6, 6))
ax.scatter(true, pred )
ax.set_xlim([2, 12])
ax.set_ylim([2, 12])
plt.title("Predict vs True")
plt.xlabel("true_value")
plt.ylabel("predicted_value")
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".2")
plt.show()
# plot 2: feature importance of a refit with the best parameters
params = xgb_m.best_params_
# FIX: pass keyword arguments -- the positional order
# (max_depth, learning_rate, n_estimators) is fragile across xgboost versions.
model = XGBRegressor(max_depth=params["max_depth"],
                     learning_rate=params["learning_rate"],
                     n_estimators=params["n_estimators"])
model.fit(X_train, y_train)
plt.figure()
xgb.plot_importance(model, max_num_features=15)
plt.show()
print (xgb_m.best_params_)
ret={}
ret["test R^2"] = xgb_m.best_score_  # best mean CV score (R^2 for regressors)
ret["training R^2"] = xgb_m.score(X=X_train,y=y_train)
ret["test mae"] = test_mae
ret["training mae"] = train_mae
ret["observations_count"] = len(y)
print(ret)
'''
for x in ret:
    print(x)
    print("-----")
'''
## notice self imputed data performs worse than xgb imputed num
# Same XGBoost pipeline, but on the KNN-imputed + one-hot encoded matrix.
df_base = pd.concat([df_preprocessed, df["TOTEXP15"]],axis =1)
df_base = df_base.loc[:,~df_base.columns.duplicated()]
# FIX: as above, `.loc[(df_base["TOTEXP15"])]` indexed by values; keep rows
# with positive total expenditure instead.
df_base = df_base.loc[df_base["TOTEXP15"] > 0]
df_xgb_base = df_base.fillna(0)
# FIX: replace() is not in-place -- the result was previously discarded.
df_xgb_base = df_xgb_base.replace([np.inf, -np.inf], 0)
y = np.log(df_xgb_base["TOTEXP15"]+1)  # log-transform the skewed target
x = df_xgb_base.drop(['TOTEXP15'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state = 6)
# model fit
xgb_model = XGBRegressor()
xgb_m = GridSearchCV(xgb_model,
                     {'max_depth': [3,4,5],
                      'n_estimators': [50,75],
                      "learning_rate": [0.1,0.2,0.3]
                      }, verbose=1, n_jobs=-1, cv=3)
xgb_m = xgb_m.fit(X=X_train,y=y_train)
train_mae = mean_absolute_error(y_train, xgb_m.predict(X_train))
test_mae = mean_absolute_error(y_test, xgb_m.predict(X_test))
# plot 1: predicted vs true
pred = xgb_m.predict(X_test)
true = y_test
f, ax = plt.subplots(figsize=(6, 6))
ax.scatter(true, pred )
ax.set_xlim([2, 12])
ax.set_ylim([2, 12])
plt.title("Predict vs True")
plt.xlabel("true_value")
plt.ylabel("predicted_value")
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".2")
plt.show()
# plot 2: feature importance of a refit with the best parameters
params = xgb_m.best_params_
# FIX: keyword arguments instead of fragile positional order
model = XGBRegressor(max_depth=params["max_depth"],
                     learning_rate=params["learning_rate"],
                     n_estimators=params["n_estimators"])
model.fit(X_train, y_train)
plt.figure()
xgb.plot_importance(model, max_num_features=15)
plt.show()
print (xgb_m.best_params_)
ret={}
ret["test R^2"] = xgb_m.best_score_
ret["training R^2"] = xgb_m.score(X=X_train,y=y_train)
ret["test mae"] = test_mae
ret["training mae"] = train_mae
ret["observations_count"] = len(y)
print(ret)
#####
# fit model/plot
def DvsE_out_plot(d,e,df_input,df_all):
    '''
    Disease vs expenditure regression (XGBoost, grid-searched).

    input:
        d: diagnosis flag column name (1 == diagnosed)
        e: expenditure column name
        df_input: feature df which contains the features of interest
        df_all: the whole df, used to extract disease and expenditure
    return:
        (also plots predicted-vs-true and feature importance to explore)
        dict with the best CV R^2 and the sample count, or a string when
        fewer than 10 observations are available
    '''
    #input
    df_c = df_input
    df=df_all
    # attach the expenditure column and drop any duplicated columns
    df_c = pd.concat([df_c, df[e]],axis =1)
    df_c = df_c.loc[:,~df_c.columns.duplicated()]
    # diagnosed patients with positive expenditure only
    df_cheart = df_c.loc[(df_c[d] == 1) & (df_c[e] > 0) ]
    #preprocess: log-transform the skewed expenditure target
    y = np.log(df_cheart[e]+1)
    if len(y) < 10:
        ret = "observation count smaller than 10"
        print(d,"obervation count smaller than 10")
        return ret
    x = df_cheart.drop([e], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 6)
    #model fit: small grid search over depth / tree count / learning rate
    xgb_model = XGBRegressor()
    xgb_m = GridSearchCV(xgb_model,
                         {'max_depth': [2,3,5],
                          'n_estimators': [25,45,65],
                          "learning_rate": [0.1,0.2,0.3]
                          }, verbose=1, n_jobs=-1, cv=3)
    xgb_m = xgb_m.fit(X=X_train,y=y_train)
    # NOTE(review): the MAEs are computed but no longer returned (see the
    # commented-out ret entries below)
    train_mae = mean_absolute_error(y_train, xgb_m.predict(X_train))
    test_mae = mean_absolute_error(y_test, xgb_m.predict(X_test))
    #plt1: predicted vs true scatter with the identity line
    pred = xgb_m.predict(X_test)
    true = y_test
    f, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(true, pred )
    ax.set_xlim([0, 12])
    ax.set_ylim([0, 12])
    plt.title("Predict vs True")
    plt.xlabel("true_value")
    plt.ylabel("predicted_value")
    ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
    plt.show()
    #plt2: feature importance of a refit with the best parameters
    params = xgb_m.best_params_
    # NOTE(review): positional XGBRegressor args assume the
    # (max_depth, learning_rate, n_estimators) order -- confirm for this
    # xgboost version
    model = XGBRegressor(params["max_depth"], params["learning_rate"], params["n_estimators"])
    model.fit(X_train, y_train)
    plt.figure()
    xgb.plot_importance(model, max_num_features=15)
    plt.show()
    #return
    print (xgb_m.best_params_)
    ret={}
    ret["test R^2"] = xgb_m.best_score_  # best mean CV score
    #ret["training R^2"] = xgb_m.score(X=X_train,y=y_train)
    #ret["test mae"] = test_mae
    #ret["training mae"] = train_mae
    ret["observations_count"] = len(y)
    return ret
df_input = df_c
df_all = df
# explore coronary heart disease vs total expenditure
DvsE_out_plot('CHDDX','TOTEXP15',df_input,df_all)
# FIX: removed a stale duplicate call to `DvsE_out(...)` -- that function is
# never defined anywhere in this file and the call raised NameError.
# explore coronary heart disease vs office-based expenditure
DvsE_out_plot('CHDDX','OBVEXP15',df_input,df_all)
# explore coronary heart disease vs outpatient expenditure
DvsE_out_plot('CHDDX','OPVEXP15',df_input,df_all)
# explore coronary heart disease vs prescription expenditure
DvsE_out_plot('CHDDX','RXEXP15',df_input,df_all)
# explore pregnancy vs total / office-based / outpatient / prescription exp
# (comments previously copy-pasted from the CHDDX block)
DvsE_out_plot('PREGNT31','TOTEXP15',df_input,df_all)
DvsE_out_plot('PREGNT31','OBVEXP15',df_input,df_all)
DvsE_out_plot('PREGNT31','OPVEXP15',df_input,df_all)
DvsE_out_plot('PREGNT31','RXEXP15',df_input,df_all)
def DvsE_cls(d, e, df_input, df_all):
    """Classify the expenditure level (low/medium/high) of diagnosed patients.

    Bins expenditure column `e` into three equal-width classes for patients
    with diagnosis flag `d` == 1, fits an XGBoost classifier via a small grid
    search, and returns a dict with the best CV accuracy, the held-out
    accuracy and the winning parameters.
    """
    # Attach the expenditure column and drop duplicated columns.
    features = pd.concat([df_input, df_all[e]], axis=1)
    features = features.loc[:, ~features.columns.duplicated()]
    diagnosed = features.loc[features[d] == 1]
    # Equal-width 3-way binning of the target: low vs medium vs high value.
    target = pd.cut(diagnosed[e], 3, labels=["low","medium","high"])
    design = diagnosed.drop([e], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(design, target, test_size=0.2, random_state = 6)
    # Grid-search an XGBoost classifier over depth / tree count / learning rate.
    grid = GridSearchCV(XGBClassifier(),
                        {'max_depth': [3,4,5],
                         'n_estimators': [25,50,75],
                         "learning_rate": [0.1,0.2,0.3]
                         }, verbose=1, n_jobs=-1, cv=3)
    grid = grid.fit(X=X_train, y=y_train)
    held_out_acc = accuracy_score(y_test, grid.predict(X_test))
    ret_dic = {}
    ret_dic["train_acc"] = grid.best_score_
    ret_dic["test_acc"] = held_out_acc
    ret_dic["parameter"] = grid.best_params_
    print(len(target))
    return ret_dic
# Example classification runs.
print(DvsE_cls('CANCERDX','TOTEXP15',df_c,df))
print(DvsE_cls('CHDDX','OBVEXP15',df_c,df))
# Diagnosis flags: cancer / coronary heart disease / stroke / other heart
# disease / high blood pressure / emphysema / diabetes / arthritis / asthma /
# ADHD-ADD / pregnancy / independent-living screener.
disease_f = ["CANCERDX", "CHDDX","STRKDX","OHRTDX","HIBPDX","EMPHDX","DIABDX",
             "ARTHDX", "ASTHDX", "ADHDADDX","PREGNT31","IADLHP31"]
# Expenditure columns: total / outpatient / office-based / home health /
# emergency room / inpatient / prescription.
exp_l = ["TOTEXP15", "OPTEXP15", "OBVEXP15", "HHAEXP15", "ERTEXP15","IPTEXP15", "RXEXP15"]
def summary_y(d,e,df_input,df_all):
    '''
    Describe the expenditure distribution of diagnosed patients.

    input:
        d: diagnosis flag column name (1 == diagnosed)
        e: expenditure column name
        df_input: feature df
        df_all: the whole df, used to extract d and e
    return:
        pandas describe() summary of `e` over diagnosed patients with
        positive expenditure, or an explanatory string when fewer than 10
        observations are available.  Also shows a distplot of log(e + 10).
    '''
    #input
    df_c = df_input
    df=df_all
    df_c = pd.concat([df_c, df[e]],axis =1)
    df_c = df_c.loc[:,~df_c.columns.duplicated()]
    # diagnosed patients with positive expenditure only
    df_cheart = df_c.loc[(df_c[d] == 1) & (df_c[e] >0)]
    #preprocess
    y = df_cheart[e]
    plt.figure()
    sns.distplot(np.log(y+10))  # +10 keeps the log away from log(0)
    plt.show()
    if len(y) < 10:
        ret = "observation count smaller than 10"
        # FIX: typo "obervation" in the printed message
        print(d,"observation count smaller than 10")
        return ret
    # FIX: removed the unused `x = df_cheart.drop([e], axis=1)` (dead code)
    return y.describe()
#emphdx hhaexp
# Distribution summary for every disease x expenditure pair, exported to Excel.
out_dic_summary ={}
n=0
for d in disease_f:
    out_dic_summary[d] = {}
    for e in exp_l:
        print("----------------------------------")
        print(n)
        print(d)
        print(e)
        n+=1
        try:
            out_dic_summary[d][e] = summary_y(d,e,df_c,df)
            print(out_dic_summary[d][e])
        except Exception:
            # FIX: the except body previously wrote to an undefined `out_dic`
            # (raising NameError), and the bare `except:` hid the real error
            # while also swallowing KeyboardInterrupt/SystemExit.
            out_dic_summary[d][e] = "NaN"
outt_df_summary = pd.DataFrame(out_dic_summary)
outt_df_summary = outt_df_summary.transpose()
outt_df_summary.to_excel("MEPS_Disease_Expenditure_stats_summary_7.12.xlsx")
#####
# Model performance for every disease x expenditure pair, exported to Excel.
out_dic_summary ={}
n=0
for d in disease_f:
    out_dic_summary[d] = {}
    for e in exp_l:
        print("----------------------------------")
        print(n)
        print(d)
        print(e)
        n+=1
        try:
            out_dic_summary[d][e] = DvsE_out_plot(d,e,df_c,df)
            print(out_dic_summary[d][e])
        except Exception:
            # FIX: narrowed the bare `except:` -- it also swallowed
            # KeyboardInterrupt/SystemExit; failed pairs are recorded as "NaN".
            out_dic_summary[d][e] = "NaN"
outt_df_summary = pd.DataFrame(out_dic_summary)
outt_df_summary = outt_df_summary.transpose()
outt_df_summary.to_excel("MEPS_Disease_Expenditure_model_performance_7.12.xlsx")
## overfitting problem: large feature size and small sample size?
## how we treat 0 in response
## compare to previous one where best model is test r^2 0.73 test MAE 0.75; now the OBVEXP column model performs better,
## ERTOT(medcine), TOTEXP are ok; others sample size small
# obtotv15 obvexp15
corr
#### cluster disease
# Diagnosis flags of interest: cancer / coronary heart disease / stroke /
# other heart disease / high blood pressure / emphysema / diabetes /
# arthritis / asthma / ADHD-ADD / pregnancy.
disease_f = ["CANCERDX", "CHDDX","STRKDX","OHRTDX","HIBPDX","EMPHDX","DIABDX",
             "ARTHDX", "ASTHDX", "ADHDADDX","PREGNT31"]
# Count distinct values in every health column to spot yes/no diagnosis flags.
hl = {col: len(df[str(col)].unique()) for col in df_health.columns}
# "DX" columns with at most 3 distinct values are treated as diagnosis flags;
# everything else is considered numeric.
hcat_columns = [col for col, count in hl.items() if count <= 3 and "DX" in col]
hnum_columns = [col for col, count in hl.items() if not (count <= 3 and "DX" in col)]
# Make sure every preselected disease flag is included, then drop BPMLDX.
hcat_columns = list(set(hcat_columns).union(set(disease_f)))
hcat_columns.remove("BPMLDX")
len(hcat_columns)
#
# Diagnosis-flag frame used for clustering.
df_dig = df[hcat_columns]
# FIX: removed a stray `df_dig_corplot.columns` line here -- df_dig_corplot
# is only defined inside the commented-out block below, so it raised NameError.
'''
df_dig = df_dig.fillna(1.5)
df_dig_corplot = df_dig.fillna(0)
df_dig_corplot.columns=["Pregnant", "Emphysema","Arthritis",
    "High Cholesterol","Heart Attack ", "Cancer","Diabetes", "High Blood Pressure","Asthma",
    "Attention Disorder",
    "Coronary Heart Disease", "Other Heart Disease","Stroke", "Angina",'sum_disease', 'cluster',
    'TOTEXP15', 'sum_HBP']
X=df_dig
X.shape
'''
# FIX: sklearn KMeans raises on NaN input; fill missing diagnosis flags first
# (1.5 = midpoint of the 1/2 yes-no coding, as in the commented block above).
df_dig = df_dig.fillna(1.5)
kmeans = KMeans(n_clusters=4, random_state=0).fit(df_dig)
'''
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
'''
# number of diagnoses per person (flag value 1 == diagnosed)
df_dig['Total_Disease_Counts'] = (df_dig[hcat_columns] == 1).sum(axis=1)
df_dig["cluster"] = kmeans.labels_
#df_dig["cluster"] = ms.labels_
# readable labels; order must match hcat_columns plus the two added columns
df_dig.columns=["Pregnant", "Emphysema","Arthritis",
                "High Cholesterol","Heart Attack ", "Cancer","Diabetes", "High Blood Pressure","Asthma",
                "Attention Disorder",
                "Coronary Heart Disease", "Other Heart Disease","Stroke", "Angina","sum_disease","cluster"]
df_dig = pd.concat([df_dig, df.TOTEXP15],axis =1)
df_dig.columns
# Expenditure statistics by cluster.
df_mean = df_dig.groupby(['cluster']).mean()
print("mean across different clusters: ")
print(df_mean.TOTEXP15)
print("median across different clusters: ")
df_median = df_dig.groupby(['cluster']).median()
print(df_median.TOTEXP15)
# mean number of diagnoses per cluster
df_dig[['sum_disease',"cluster"]].groupby(['cluster']).agg(['mean'])
df_mean = df_dig[['sum_disease',"cluster"]].groupby(['cluster']).mean()
print("mean across different clusters: ")
print(df_mean.sum_disease)
df_mean = df_dig[['sum_disease',"cluster"]].groupby(['cluster']).mean()
print("mean of sum_disease across clusters: ")
print(df_mean.sum_disease)
df_mean = df_dig.groupby(['cluster']).mean()
print("mean of TOTEXP across clusters: ")
print(df_mean.TOTEXP15)
#df_dig.sort_values(by=['cluster'])
corr = df_dig.corr()
corr
# diseases most correlated with diabetes: hypertension, diabetes, heart
# disease, heart attack, stroke, angina
corr.Diabetes.sort_values(ascending=False)
# co-occurrence count of the hypertension-related disease group
df_dig['sum_HBP'] = (df_dig[["High Blood Pressure","Diabetes","Coronary Heart Disease","Stroke"]] == 1).sum(axis=1)
#corr.CHDDX.sort_values(ascending=False)
#corr.CHDDX.sort_values(ascending=False) #coronary heart disease #doc 7.13: prem 700 <->1.2k
#corr.DIABDX.sort_values(ascending=False)
# diabetes vs chddx vs cholesterol vs heart attack vs other kinds of heart disease vs angina
corr.TOTEXP15.sort_values(ascending=False) #cancer
corr.columns
#corr.TOTEXP15.sort_values(ascending=False) #total exp
corr.HIBPDX .sort_values(ascending=False)
# 1. people with diabetes are twice as likely to get coronary disease and stroke as people without diabetes
# 2. cancer vs lung disease
# ----------
# is it poss
df_dig.sum_disease.describe()
#df_dig.iloc[:,-:]
corr.columns
# histogram of diagnoses per patient
ax = sns.distplot(df_dig.sum_disease, kde=False)
ax.set(xlabel='Number_Disease_Diagnosed', ylabel='Count')
ax.set(title = "Patients with Multiple Disease")
#sns.distplot(df_dig.sum_HBP, kde=False)
# NOTE(review): after the rename above df_dig no longer has MEPS-coded
# columns, so the queries below (DIABDX, HIBPDX, ...) look like they predate
# the rename and would raise -- confirm which frame was intended.
df_dig.query('DIABDX == 1').shape[0] #
df_dig.query('DIABDX == 1 & HIBPDX == 1').shape[0]
df_dig.query('DIABDX == 1 & (CHDDX == 1 | OHRTDX==1 |MIDX ==1) & HIBPDX == 1').shape[0]
df_dig.query('STRKDX == 1 & (CHDDX == 1 | OHRTDX==1 |MIDX ==1) & HIBPDX == 1 & DIABDX == 1').shape[0]
df_dig.query('DIABDX == 1 & CHDDX == 1').shape
df_dig.query('CHDDX == 1').shape
df_dig.query('CANCERDX == 1 & EMPHDX == 1').shape
df_dig.query('PREGNT31 == 1').shape
df_dig.query('HIBPDX == 1 & PREGNT31 == 1').shape
# Correlation heatmap of the diagnosis flags only.
# FIX: `df_dig_corplot` was never defined (it exists only inside a
# commented-out block); derive the plotting frame from df_dig directly.
df_dig_corplott = df_dig.drop(['sum_disease', 'cluster',"sum_HBP","TOTEXP15"], axis=1)
corr = df_dig_corplott.corr()
mask = np.zeros_like(corr, dtype=bool)  # FIX: np.bool is removed in modern NumPy
mask[np.triu_indices_from(mask)] = 1
# plt corr heatmap
f, ax = plt.subplots(figsize=(8, 6))
cmap = sns.diverging_palette(230, 9, as_cmap=True)  # FIX: as_cmap expects a bool
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1,vmin=-0.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .6})
# emphdc vs angindx
# diabdx vs hbp
# choldx vs strkdx
# FIX: the following note was a stray markdown line pasted as code (it caused
# a SyntaxError); kept here as a comment:
# For clustering: K-means gives a good linear/correlation relation between
# expenditure and the sum of diseases; with mean-shift clustering the linear
# relation is less obvious, but the correlation is preserved.
corr